import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
Celem projektu jest klasyfikacja różnych gatunków fasoli na podstawie ich cech wizualnych.
# Load the Dry Bean dataset; the path is relative to the working directory.
data = pd.read_csv('../Data/Dry_Bean_Dataset.csv')
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly: wrapping it in print() would add a stray "None" line.
data.info()
Area (A): The area of a bean zone and the number of pixels within its boundaries.
Perimeter (P): Bean circumference is defined as the length of its border.
Major axis length (L): The distance between the ends of the longest line that can be drawn from a bean.
Minor axis length (l): The longest line that can be drawn from the bean while standing perpendicular to the main axis.
Aspect ratio (K): Defines the relationship between L and l.
Eccentricity (Ec): Eccentricity of the ellipse having the same moments as the region.
Convex area (C): Number of pixels in the smallest convex polygon that can contain the area of a bean seed.
Equivalent diameter (Ed): The diameter of a circle having the same area as a bean seed area.
Extent (Ex): The ratio of the pixels in the bounding box to the bean area.
Solidity (S): Also known as convexity. The ratio of the pixels in the convex shell to those found in beans.
Roundness (R): Calculated with the following formula: $R = \dfrac{4\pi A}{P^{2}}$
Compactness (CO): Measures the roundness of an object: Ed/L
ShapeFactor1 (SF1)
ShapeFactor2 (SF2)
ShapeFactor3 (SF3)
ShapeFactor4 (SF4)
Class (Seker, Barbunya, Bombay, Cali, Dermason, Horoz and Sira)
# Split the data into a working set and a held-out validator set (20 %).
our_data, validator_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Persist both parts so they can be reloaded without re-splitting.
our_data.to_csv('../Data/our_data.csv', index=False)
validator_data.to_csv('../Data/validator_data.csv', index=False)

# Separate the target column from the feature columns of the working set.
y = our_data['Class']
X = our_data.drop(columns='Class')

# First stratified split: 30 % of the working set is set aside.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
# Second stratified split: 30 % of the set-aside part becomes the test set,
# the rest stays as the validation set.
X_val, X_test, y_val, y_test = train_test_split(
    X_val,
    y_val,
    test_size=0.3,
    stratify=y_val,
    random_state=42,
)
# The exploratory plots below work on the whole working set.
df = our_data
# Number of samples per bean class.
sns.countplot(data=df, x='Class')
# One histogram per feature column.
X.hist(figsize=(20, 15), bins=40)
# Distribution of the shape factors per class
# SF1
plt.figure(figsize=(10, 6))
sns.violinplot(data=df, x='Class', y='ShapeFactor1')
SHAPE FACTOR 1:
-bombay odstaje
-cali i barbunya w podobnym miejscu mają najszerszy punkt
-seker, sira, horoz też na podobnym poziomie
-dermason najbardziej rozstrzelony
# SF2: distribution of ShapeFactor2 within each class
plt.figure(figsize=(10, 6))
sns.violinplot(data=df, x='Class', y='ShapeFactor2')
SHAPE FACTOR 2:
# SF3: distribution of ShapeFactor3 within each class
plt.figure(figsize=(10, 6))
sns.violinplot(data=df, x='Class', y='ShapeFactor3')
SHAPE FACTOR 3:
# SF4: distribution of ShapeFactor4 within each class
plt.figure(figsize=(10, 6))
sns.violinplot(data=df, x='Class', y='ShapeFactor4')
SHAPE FACTOR 4:
# Distribution of the 'AspectRation' column within each class.
plt.figure(figsize=(10, 6))
sns.violinplot(data=df, x='Class', y='AspectRation')
ASPECT RATIO:
# Pairwise plots of all features, coloured by class.
sns.pairplot(data=df, hue='Class')
# Perimeter vs. Compactness, coloured by class.
sns.set(style="whitegrid")
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x='Perimeter',
    y='Compactness',
    hue='Class',
    palette='tab10',
    legend='full',
)
plt.show()
# EquivDiameter vs. Perimeter, coloured by class.
sns.set(style="whitegrid")
plt.figure(figsize=(15, 10))
sns.scatterplot(
    data=df,
    x='EquivDiameter',
    y='Perimeter',
    hue='Class',
    palette='tab10',
    legend='full',
)
plt.show()
# SHAPE FACTORS VS. COMPACTNESS
# 2x2 grid: one scatter plot per shape factor against Compactness.
sns.set(style="whitegrid")
shape_factors = ['ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
axes = axes.flatten()  # flat indexing so panel i matches shape_factors[i]
for i, shape_factor in enumerate(shape_factors):
    ax = axes[i]
    sns.scatterplot(
        data=df,
        x=shape_factor,
        y='Compactness',
        hue='Class',
        palette='tab10',
        legend='full',
        ax=ax,
    )
    ax.set(
        title=f"Scatter Plot - {shape_factor} vs Compactness",
        xlabel=shape_factor,
        ylabel='Compactness',
    )
plt.tight_layout()
plt.show()
# 1x3 grid: ShapeFactor2..4 on the x axis, each against ShapeFactor1.
sns.set(style="whitegrid")
shape_factors = ['ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4']
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 10))
axes = axes.flatten()
for i, shape_factor in enumerate(shape_factors):
    ax = axes[i]
    sns.scatterplot(
        data=df,
        x=shape_factor,
        y='ShapeFactor1',
        hue='Class',
        palette='tab10',
        legend='full',
        ax=ax,
    )
    ax.set(
        title=f"Scatter Plot - {shape_factor} vs ShapeFactor1",
        xlabel=shape_factor,
        ylabel='ShapeFactor1',
    )
plt.tight_layout()
plt.show()
sns.set(style="whitegrid")
# Scatter plot of Eccentricity vs. Major Axis Length, coloured by class.
# The title and axis labels name the columns that are actually plotted.
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Eccentricity', y='MajorAxisLength', hue='Class', data=df, palette='tab10', legend='full')
plt.title("Scatter Plot - Eccentricity vs Major Axis Length")
plt.xlabel("Eccentricity")
plt.ylabel("Major Axis Length")
# Show the plot
plt.show()
# ASPECT RATIO VS. AREA (the dataset column is spelled 'AspectRation')
sns.set(style="whitegrid")
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x='AspectRation',
    y='Area',
    hue='Class',
    palette='tab10',
    legend='full',
)
# Render the figure
plt.show()
# Correlation matrix of all columns except the class label,
# drawn as an annotated heatmap.
corr = df.drop(columns='Class').corr()
plt.figure(figsize=(10, 10))
sns.heatmap(data=corr, cmap='coolwarm', annot=True)
-Minor, Major Axis
-Area
-AspectRation
-ShapeFactor1 + ShapeFactor2/3